###############################################################
# 1) Male and female graduate enrollment (1967 - 2015)
###############################################################

# Creating a simplified data frame excluding non profit and for profit schools as well as irrelavent totals
grad_enroll_ftf <- grad_enroll %>% 
  select(Year, full_time_m, full_time_f, part_time_m, part_time_f)

#creating a linear model for male graduate enrollment
male_grad <- lm(grad_enroll$total_males ~ grad_enroll$Year)

summary(male_grad)
## 
## Call:
## lm(formula = grad_enroll$total_males ~ grad_enroll$Year)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -96461 -34861 -12841  51876  95766 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      -17112153    1087024  -15.74   <2e-16 ***
## grad_enroll$Year      9069        546   16.61   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 54050 on 47 degrees of freedom
## Multiple R-squared:  0.8545, Adjusted R-squared:  0.8514 
## F-statistic:   276 on 1 and 47 DF,  p-value: < 2.2e-16
plot(male_grad)

# Correlation Testing for male graduates:
cor(x = grad_enroll$Year, y = grad_enroll$total_males, use = "everything", method = c("pearson"))
## [1] 0.9243741
# Pearson's R = 0.92

#creating a linear model for female graduate enrollment
female_grad <- lm(grad_enroll$total_females ~ grad_enroll$Year)

summary(female_grad)
## 
## Call:
## lm(formula = grad_enroll$total_females ~ grad_enroll$Year)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -89397 -48101  -7633  45267 129727 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      -5.896e+07  1.161e+06  -50.77   <2e-16 ***
## grad_enroll$Year  3.013e+04  5.832e+02   51.66   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 57730 on 47 degrees of freedom
## Multiple R-squared:  0.9827, Adjusted R-squared:  0.9823 
## F-statistic:  2669 on 1 and 47 DF,  p-value: < 2.2e-16
plot(female_grad)

# Correlation Testing for male graduates:
cor(x = grad_enroll$Year, y = grad_enroll$total_females, method = c("pearson"))
## [1] 0.9913086
# Pearson's R = 0.99

#create a graph of the models
enroll_graph <- grad_enroll %>% 
  ggplot(aes(x = Year, y = total_males))+
  geom_point(aes(color = "total_males"))+
  geom_smooth(aes(x = Year, y = total_males),method = lm, se = TRUE, size = 0.5, color = "gray20") + #plots the linear model with a confidence interval (se)
  geom_point(aes(x = Year, y = total_females, color = "total_females")) +
  geom_smooth(aes(x = Year, y = total_females),method = lm, se = TRUE, size = 0.5, color = "gray20")+
  theme_classic() +
  theme(legend.title=element_blank()) +
  scale_color_manual(" ", breaks = c("total_males", "total_females"), values = c("total_males" = "royalblue1", "total_females" = "palevioletred1"), labels=c("Males", "Females")) +
  labs(x = "\nYear", y = "Total Enrollment\n", title = "Male and Female Graduate Enrollment (1967-2015)\n") +
  scale_x_continuous(expand = c(0,0), limits = c(1967, 2015)) +
  theme(text = element_text(family = "Times New Roman")) + 
  theme(plot.title = element_text(hjust = 0.5, size = 13)) +
  theme(axis.title = element_text(size = 12), axis.text = element_text(size = 10))

enroll_graph

###############################################################
# 2) Shifts in female PhD recipients by field (1985, 2000, and 2015)
###############################################################

# First creating a data frame with just the 5 fields in question and just the 3 years of interest.
phds_summary <- phds %>%
  select("year", "physci_f", "engineer_f", "ed_f", "humart_f") %>% 
  filter(year == "1985" | year == "2000" | year == "2015") %>% 
  select("physci_f", "engineer_f", "ed_f", "humart_f") # I realize this line of code is redundant, but just put it in for my own understanding of the order in which things occurred.
rownames(phds_summary) <- c("1985", "2000", "2015")
## Warning: Setting row names on a tibble is deprecated.
#maybe a chisquare tests for proportions of females in each field by year
phds_chi <- chisq.test(phds_summary)
phds_chi
## 
##  Pearson's Chi-squared test
## 
## data:  phds_summary
## X-squared = 2073.3, df = 6, p-value < 2.2e-16
phds_prop <- prop.table(as.matrix(phds_summary), 1)

phds_summary2 <- phds %>%
  select("year", "physci_f", "engineer_f", "ed_f", "humart_f") %>% 
  filter(year == "1985" | year == "2000" | year == "2015") 

phds_summary3<-melt(phds_summary2, id.vars = 'year')

phds_graph<- phds_summary3 %>% 
  ggplot(aes(fill = variable, y = value, x = year)) +
  geom_bar(stat = "identity", position = "fill") +
  scale_x_continuous(breaks = c(1985, 2000, 2015)) +
  scale_y_continuous(expand = c(0,0)) +
  theme_classic() +
  labs(x = "\nYear", y = "Proportion\n", title = "Proportion of Female PhD Recipients\n") +
  theme(text = element_text(family = "Times New Roman")) + 
  theme(plot.title = element_text(hjust = 0.5, size = 13)) +
  theme(axis.title = element_text(size = 12), axis.text = element_text(size = 10)) +
  scale_fill_manual(values = c("lightpink1", "palevioletred1", "violetred", "maroon4"), labels = c("Physical and Earth Sciences", "Engineering", "Education", "Humanities and Arts")) +
  theme(legend.title=element_blank())

phds_graph

# p-value < 0.001 therefore there is a significant association between the year a degree was awarded and the number of PhDs awarded to women in that year (X^2 = 2073, *p* , 0.001)
###############################################################
# 3) Male and female salaries for starting postdoctoral and other employment positions (2015)
###############################################################

#2 mann whitney u tests (one for median postdoc salary male vs female, one for median other employment male vs female)

#explore data for posdoc salaries
male_post_sal <- ggplot(med_salary, aes(x = postdoc_m)) +
  geom_histogram(binwidth = 5000, aes(color = "black"))
male_post_sal #not normally distributed

# Checking the qq plot for this distribution
male_post_qq <- ggplot(med_salary, aes(sample = postdoc_m)) +
  geom_qq()
male_post_qq

#Looking like it's potentially linear

female_post_sal <- ggplot(med_salary, aes(x = postdoc_f)) +
  geom_histogram(binwidth = 5000, aes(color = "black"))
female_post_sal #not normally distributed

# Checking the qq plot for this distribution
female_post_qq <- ggplot(med_salary, aes(sample = postdoc_f)) +
  geom_qq()
female_post_qq

# NOT looking linear

#want comparison of means so do Mann Whitney U
#H0: Ranks are equal
#HA: ranks are different
post_sal_test <- wilcox.test(med_salary$postdoc_f, med_salary$postdoc_m, paired = TRUE)
## Warning in wilcox.test.default(med_salary$postdoc_f,
## med_salary$postdoc_m, : cannot compute exact p-value with ties
## Warning in wilcox.test.default(med_salary$postdoc_f,
## med_salary$postdoc_m, : cannot compute exact p-value with zeroes
post_sal_test
## 
##  Wilcoxon signed rank test with continuity correction
## 
## data:  med_salary$postdoc_f and med_salary$postdoc_m
## V = 16.5, p-value = 0.8884
## alternative hypothesis: true location shift is not equal to 0
med_sal_2<-melt(med_salary, id.vars = 'field')

post_sal_graph <- med_sal_2 %>%  
  filter(variable == "postdoc_m" | variable == "postdoc_f") %>% 
  ggplot(aes(x = field, y = value)) +
  geom_col(aes(fill = variable), position = "dodge") +
  scale_fill_manual(values = c("royalblue1", "palevioletred1"), labels = c("Male", "Female")) +
  theme_classic() +
  coord_flip() +
  theme(axis.text.x = element_text(hjust = 0.5, size = 8)) +
  theme(axis.text.y = element_text(hjust = 1, vjust = 0.5, size = 8)) +
  theme(legend.title=element_blank())+
  labs(x = "\nField", y = "Median Salary\n", title = "Male and Female Median Postdoc Salaries 2015\n") +
  theme(text = element_text(family = "Times New Roman")) + 
  theme(plot.title = element_text(hjust = 0.5, size = 13)) +
  theme(axis.title = element_text(size = 12)) +
  scale_y_continuous(expand = c(0,0)) +
  scale_x_discrete(labels = function(y) lapply(strwrap(y, width = 25, simplify = FALSE), paste, collapse="\n"))

post_sal_graph

#explore data for employement salaries
male_employ_sal <- ggplot(med_salary, aes(x = employment_m)) +
  geom_histogram(binwidth = 15000)
male_employ_sal #maybe normally distributed?

female_employ_sal <- ggplot(med_salary, aes(x = employment_f)) +
  geom_histogram(binwidth = 15000)
female_employ_sal #maybe normally distributed?

employ_sal_test <- wilcox.test(med_salary$employment_f, med_salary$employment_m, paired = TRUE)
## Warning in wilcox.test.default(med_salary$employment_f,
## med_salary$employment_m, : cannot compute exact p-value with ties
## Warning in wilcox.test.default(med_salary$employment_f,
## med_salary$employment_m, : cannot compute exact p-value with zeroes
employ_sal_test
## 
##  Wilcoxon signed rank test with continuity correction
## 
## data:  med_salary$employment_f and med_salary$employment_m
## V = 4, p-value = 0.002572
## alternative hypothesis: true location shift is not equal to 0
#there is a significant difference, p = .003, so test cliffs delta
employ_sal_cliffs <- cliff.delta(med_salary$employment_f, med_salary$employment_m)
employ_sal_cliffs
## 
## Cliff's Delta
## 
## delta estimate: -0.2133333 (small)
## 95 percent confidence interval:
##        inf        sup 
## -0.5633695  0.2016324
employ_sal_graph <- med_sal_2 %>%  
  filter(variable == "employment_m" | variable == "employment_f") %>% 
  ggplot(aes(x = field, y = value)) +
  coord_flip() +
  geom_col(aes(fill = variable), position = "dodge") +
  scale_fill_manual(values = c("royalblue1", "palevioletred1"), labels = c("Male", "Female")) +
  theme_classic() +
  theme(axis.text.x = element_text(hjust = 0.5, size = 8)) +
  theme(axis.text.y = element_text(hjust = 1, vjust = 0.5, size = 8)) +
  theme(legend.title=element_blank())+
  labs(x = "\nField", y = "Median Salary\n", title = "Male and Female Median non Postdoc Salaries 2015\n") +
  theme(text = element_text(family = "Times New Roman")) + 
  theme(plot.title = element_text(hjust = 0.5, size = 13)) +
  theme(axis.title = element_text(size = 12)) +
  scale_y_continuous(expand = c(0,0)) +
  scale_x_discrete(labels = function(y) lapply(strwrap(y, width = 25, simplify = FALSE), paste, collapse="\n"))

employ_sal_graph

###############################################################
# 4) Exploring academic salaries for professors in U.S. colleges
###############################################################

# Find a good multiple linear regression model with the output being salary, and the variables (some combination of) sex, discipline, rank, years of service, years since phd 

# Some data exploration 
#plot of salary v years service
salary_plot <- ggplot (salary_data, aes(x=salary, y=yrs.service))+
  geom_point(aes(color=sex), alpha=0.6)
salary_plot

#plot of salary v years service
salary_plot2 <- ggplot (salary_data, aes(x=salary, y=yrs.since.phd))+
  geom_point(aes(color=sex), alpha=0.6)
salary_plot2

#plot of salary by faculty rank
facultyrank_plot <- ggplot (salary_data, aes(x=faculty.rank, y=salary))+
  geom_point(aes(color=sex))
facultyrank_plot

# well this doesn't seem to be saying much, not seeing any glaring trends here

# summary table looking at salary by sex and faculty position... interesting? maybe.
avg_salary_sex <- salary_data %>% 
  group_by(sex, faculty.rank) %>% 
  summarize(mean= mean(salary),
            count=n())
avg_salary_sex

A tibble: 6 x 4

Groups: sex [?]

sex faculty.rank mean count 1 Female AssocProf 88513. 10 2 Female AsstProf 78050. 11 3 Female Prof 121968. 18 4 Male AssocProf 94870. 54 5 Male AsstProf 81311. 56 6 Male Prof 127121. 248

# linear model with ALL possible variables

salary_lm1 <- lm (salary ~ sex+faculty.rank+discipline+yrs.since.phd+yrs.service, data=salary_data)

summary(salary_lm1)

Call: lm(formula = salary ~ sex + faculty.rank + discipline + yrs.since.phd + yrs.service, data = salary_data)

Residuals: Min 1Q Median 3Q Max -65248 -13211 -1775 10384 99592

Coefficients: Estimate Std. Error t value Pr(>|t|)
(Intercept) 78862.8 4990.3 15.803 < 2e-16 sexMale 4783.5 3858.7 1.240 0.21584
faculty.rankAsstProf -12907.6 4145.3 -3.114 0.00198
faculty.rankProf 32158.4 3540.6 9.083 < 2e-16
disciplineB 14417.6 2342.9 6.154 1.88e-09 yrs.since.phd 535.1 241.0 2.220 0.02698
yrs.service -489.5 211.9 -2.310 0.02143 *
— Signif. codes: 0 ‘’ 0.001 ’’ 0.01 ’’ 0.05 ‘.’ 0.1 ‘’ 1

Residual standard error: 22540 on 390 degrees of freedom Multiple R-squared: 0.4547, Adjusted R-squared: 0.4463 F-statistic: 54.2 on 6 and 390 DF, p-value: < 2.2e-16

# taking out yrs.since.phd, since it likely pretty much same as years.service; also makes no sense that the years of service would have a negative coefficient

salary_lm2 <- lm(salary ~ sex+faculty.rank+discipline+yrs.since.phd, data=salary_data)

AIC (salary_lm2) # 9097.22

[1] 9097.22

vif(salary_lm2) # all <4
              GVIF Df GVIF^(1/(2*Df))

sex 1.028359 1 1.014080 faculty.rank 1.987205 2 1.187301 discipline 1.055727 1 1.027486 yrs.since.phd 2.065517 1 1.437191

summary(salary_lm2)

Call: lm(formula = salary ~ sex + faculty.rank + discipline + yrs.since.phd, data = salary_data)

Residuals: Min 1Q Median 3Q Max -67451 -13860 -1549 10716 97023

Coefficients: Estimate Std. Error t value Pr(>|t|)
(Intercept) 80988.47 4931.84 16.422 < 2e-16 sexMale 4349.37 3875.39 1.122 0.26242
faculty.rankAsstProf -13104.15 4167.31 -3.145 0.00179
faculty.rankProf 32928.40 3544.40 9.290 < 2e-16
disciplineB 13937.47 2346.53 5.940 6.32e-09 * yrs.since.phd 61.01 127.01 0.480 0.63124
— Signif. codes: 0 ‘’ 0.001 ’’ 0.01 ’’ 0.05 ‘.’ 0.1 ‘’ 1

Residual standard error: 22660 on 391 degrees of freedom Multiple R-squared: 0.4472, Adjusted R-squared: 0.4401 F-statistic: 63.27 on 5 and 391 DF, p-value: < 2.2e-16

# taking out yrs.service since that and faculty.rank are likely collinear
salary_lm3 <- lm(salary ~ faculty.rank + discipline + yrs.since.phd + sex + sex*faculty.rank, data=salary_data)

AIC(salary_lm3) # 9100.777

[1] 9100.777

vif(salary_lm3) # really high for faculty.rank and faculty.rank*sex
                  GVIF Df GVIF^(1/(2*Df))

faculty.rank 60.837395 2 2.792818 discipline 1.059384 1 1.029264 yrs.since.phd 2.077706 1 1.441425 sex 4.168178 1 2.041612 faculty.rank:sex 86.926719 2 3.053432

summary(salary_lm3)

Call: lm(formula = salary ~ faculty.rank + discipline + yrs.since.phd + sex + sex * faculty.rank, data = salary_data)

Residuals: Min 1Q Median 3Q Max -67531 -13938 -1215 10765 96921

Coefficients: Estimate Std. Error t value Pr(>|t|)
(Intercept) 79193.33 7654.84 10.346 < 2e-16 faculty.rankAsstProf -7848.08 10016.19 -0.784 0.433787
faculty.rankProf 33599.53 9015.90 3.727 0.000223
disciplineB 14028.24 2355.32 5.956 5.79e-09 *** yrs.since.phd 58.23 127.64 0.456 0.648510
sexMale 6464.05 7817.86 0.827 0.408839
faculty.rankAsstProf:sexMale -6308.13 10839.48 -0.582 0.560932
faculty.rankProf:sexMale -874.01 9603.83 -0.091 0.927535
— Signif. codes: 0 ‘’ 0.001 ’’ 0.01 ’’ 0.05 ‘.’ 0.1 ‘’ 1

Residual standard error: 22710 on 389 degrees of freedom Multiple R-squared: 0.4478, Adjusted R-squared: 0.4379 F-statistic: 45.07 on 7 and 389 DF, p-value: < 2.2e-16

plot(salary_lm3)

# taking out yrs.since.phd and keeping yrs.service
salary_lm4<- lm (salary~discipline+yrs.service+sex, data= salary_data)

AIC(salary_lm4) # 9257.162

[1] 9257.162

vif(salary_lm4) # all < 4

discipline yrs.service sex 1.028760 1.053649 1.025117

summary(salary_lm4)

Call: lm(formula = salary ~ discipline + yrs.service + sex, data = salary_data)

Residuals: Min 1Q Median 3Q Max -77424 -19404 -4635 15539 105391

Coefficients: Estimate Std. Error t value Pr(>|t|)
(Intercept) 84361.1 4941.4 17.072 < 2e-16 disciplineB 13033.8 2840.3 4.589 6.01e-06 yrs.service 832.2 110.2 7.550 3.07e-13 *** sexMale 8423.3 4744.5 1.775 0.0766 .
— Signif. codes: 0 ‘’ 0.001 ’’ 0.01 ’’ 0.05 ‘.’ 0.1 ‘’ 1

Residual standard error: 27790 on 393 degrees of freedom Multiple R-squared: 0.1646, Adjusted R-squared: 0.1582 F-statistic: 25.81 on 3 and 393 DF, p-value: 2.928e-15

stargazer(salary_lm1, salary_lm2, salary_lm3, salary_lm4, type = "html")
Dependent variable:
salary
(1) (2) (3) (4)
sexMale 4,783.493 4,349.366 6,464.051 8,423.335*
(3,858.668) (3,875.393) (7,817.858) (4,744.537)
faculty.rankAsstProf:sexMale -6,308.131
(10,839.480)
faculty.rankProf:sexMale -874.006
(9,603.828)
faculty.rankAsstProf -12,907.590*** -13,104.150*** -7,848.084
(4,145.278) (4,167.315) (10,016.190)
faculty.rankProf 32,158.410*** 32,928.400*** 33,599.530***
(3,540.647) (3,544.403) (9,015.904)
disciplineB 14,417.630*** 13,937.470*** 14,028.240*** 13,033.850***
(2,342.875) (2,346.534) (2,355.315) (2,840.349)
yrs.since.phd 535.058** 61.011 58.228
(240.994) (127.010) (127.640)
yrs.service -489.516** 832.154***
(211.938) (110.215)
Constant 78,862.820*** 80,988.470*** 79,193.330*** 84,361.070***
(4,990.326) (4,931.844) (7,654.839) (4,941.371)
Observations 397 397 397 397
R2 0.455 0.447 0.448 0.165
Adjusted R2 0.446 0.440 0.438 0.158
Residual Std. Error 22,538.650 (df = 390) 22,663.240 (df = 391) 22,708.750 (df = 389) 27,789.810 (df = 393)
F Statistic 54.195*** (df = 6; 390) 63.266*** (df = 5; 391) 45.071*** (df = 7; 389) 25.810*** (df = 3; 393)
Note: p<0.1; p<0.05; p<0.01